;---------------------------------------------------------------------------;
;                                                                           ;
; This is the Ross Data Compression implemented in 68000 assembler          ;
; by Niklas Sjöberg, 2:203/415.3@Fidonet                                    ;
;                                                                           ;
;                                                                           ;
; The source is fully in the Public Domain. If you think you can improve    ;
; some parts, feel free to modify the code. HOWEVER, be careful to          ;
; comment the parts you change!                                             ;
;                                                                           ;
; On entry:                                                                 ;
;   a0 points to XpkSubParams                                               ;
;---------------------------------------------------------------------------;
; 921205 - Due to optimization, the comments may seem a bit weird. Just try
;          to read 5-10 lines at a time and I'm sure you'll get it right.
; 930319 - Some optimizations by John Harris - jharris@cup.portal.com
;          All new opts marked with a '@'. Cycle savings at 68000 level:
;          mloop is 8-10 cycles faster
;          path to s_loop is 18 cycles faster (outside the loop)
;          sr_loop is 32 cycles faster (outside)
;          lr_loop is about 172 cycles faster, plus the loop was changed to
;          .l for 50 cycles saved on every 4 bytes past cnt=23
;          rl_loop is about 186 cycles faster, plus if src & dest are
;          either both even or both odd, the loop will use .l,
;          for 58 cycles saved on every 4 bytes past cnt=19
; 930409 - (Niklas Sjoberg) Fixed some lethal bugs. Might take a few more
;          cycles, but decompression won't be safe otherwise!
; 930418 - (John Harris) There were ways to fix the bugs in the routine
;          without losing any cycles. Some other things had been changed
;          by Niklas that have been put back to the way I originally wrote
;          them.
;        - In Niklas's translation of my code, a couple of inadvertent
;          changes were made that caused the routine to break on the 68000.
;          These were restored. This restores the speed to its full
;          potential as well.
;        - Fixed one additional guru (crash)
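;---------------------------------------------------------------------------
; Overview of the token stream, reconstructed from the routine below rather
; than from any RDC specification, so treat it as a sketch:
;
;   A 16-bit control word (big-endian) is read, then its bits are consumed
;   MSB first, each bit governing one token:
;     bit = 0 -> one literal byte, copied straight to the output
;     bit = 1 -> a command byte follows: type = cmd >> 4, n = cmd & 15
;       type 0    : short RLE,     repeat the next byte (n + 3) times
;       type 1    : long RLE,      count = (next << 4) + n + 19, then one
;                                  byte giving the char to repeat
;       type 2    : long pattern,  offset = (next << 4) + n + 3, count =
;                                  the byte after that + 16; copy count
;                                  bytes from output - offset
;       type 3-15 : short pattern, count = type, offset = (next << 4) + n
;                                  + 3; copy count bytes from output - offset
;---------------------------------------------------------------------------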
	xdef	_unpack			; The only function called from
					; the C part of the library
	INCLUDE	"libraries/xpksub.i"
	section	code

_unpack
	movem.l	a2-a6/d2-d7,-(SP)
	move.l	xsp_InBuf(a0),a1	; source pointer
	move.l	xsp_InLen(a0),d1	; source len
	move.l	xsp_OutBuf(a0),a2	; destination
	move.l	a1,a3			; inbuff_idx = inbuff
	move.l	a2,a4			; outbuff_idx = outbuff
	move.l	a1,a5			; inbuff_end = inbuff
	adda.l	d1,a5			; + inbuff_len
	moveq	#0,d0
	moveq	#0,d2
	moveq	#0,d3
	moveq	#0,d4
	moveq	#0,d5
	moveq	#0,d6
	moveq	#0,d7
	bra.s	mloop			; @ New line. mloop was restructured
					; to remove one branch, and adjust the
					; others to fall through most often
					; instead of branching.
					; loopend moved here to make 1 branch
					; short
loopend
	suba.l	a2,a4			; This is the only exit point
	move.l	a4,xsp_OutBufLen(a0)	; return length to xpkmaster
	movem.l	(SP)+,a2-a6/d2-d7
	moveq	#0,d0
	rts
new_ctrlbits				; @ New label, and moved routine
;	move.w	(a3)+,d2		; new load ctrl-bits
	move.b	(a3)+,-(SP)		; Slightly faster than shifting
	move.w	(SP)+,d2		; and works on the 68000...
	move.b	(a3)+,d2
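; For reference, the shift-based load that the stack trick above replaces
; would look like this (kept commented out; note that the plain move.w
; version above it would take an address error on a 68000 whenever a3 is
; odd):
;	move.b	(a3)+,d2
;	lsl.w	#8,d2
;	move.b	(a3)+,d2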
;
	moveq	#15,d3			; ctrl bit index 15, i.e.
					; ctrl_mask = 0x8000
; This is ugly: in the worst case we may overrun the source buffer, since
; source and source-end are only compared here. Even an all-literal control
; word overruns by at most 15 bytes, though, and hell, we have 256
; XPK_MARGIN bytes to play with :=)  (I hope...)
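; Worked worst case: the cmp below can pass with a3 == inbuff_end - 1 right
; after a control word has been read; if all 16 control bits then turn out
; to be literals, 16 more source bytes are fetched, leaving a3 at
; inbuff_end + 15.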
	cmp.l	a5,a3			; main loop. Go on until end of
	bge.s	loopend			; source buffer
	btst	d3,d2			; @ These two lines are duplicated
	bne.s	crunched		; for branch efficiency
copy_char				; @ New label
	move.b	(a3)+,(a4)+		; otherwise, copy char and advance
;	bra.s	mloop			; @ This line removed
mloop
	subq.b	#1,d3
;	lsr.w	#1,d3			; Check if we need to get a load of
					; control bits
	bmi.s	new_ctrlbits		; @ Changed branch priority
;	move.w	d2,d7			; If bits & mask turns out to be one
;	and.w	d3,d7			; then data is compressed
	btst	d3,d2
	beq.s	copy_char		; @ Changed branch priority
crunched
					; @ d4 only needs to be word-extended
					; for s_loop, so I removed moveq #0
					; and added ext.w at s_loop
;	moveq	#0,d4			; First find out which compression
					; method was used.
	move.b	(a3)+,d4		; d4 will eventually contain 0-15
	move.w	d4,d5			; d5 contains count (further count bits
	and.w	#$f,d5			; @ (.w) may be needed for some methods)
	lsr.b	#4,d4			; The higher 4 bits of d4 are the type,
;	and.w	#$f,d4			; the lower 4 the count
;	cmpi.b	#2,d4			; @ cmd > 2 means a short pattern
	subq.b	#2,d4			; @ subq is faster than cmpi -- we just
					; have to adjust two later uses
	bls.s	no_spat			; type 0-2; fall through for a short
					; pattern (3-15)
;	move.w	d5,d6			; count is 3 chars higher actually
	addq.b	#3,d5			; d5 is never >15+3
	moveq	#0,d7			; to use d4 as loop register
	move.b	(a3)+,d7		; decode offset
	asl.w	#4,d7
	add.w	d5,d7
	move.l	a4,a6			; Next copy d4 characters from a6
	sub.l	d7,a6			; to a4
;	subq.b	#1,d4			; @
					; @ This line would get adjusted to
					; addq #1 to compensate for subq #2,
	move.b	(a6)+,(a4)+		; @ but I'll just add one unrolled move
					; which does the same thing, faster
	ext.w	d4			; @ since the earlier moveq was removed
s_loop	move.b	(a6)+,(a4)+
	dbf	d4,s_loop
	bra.s	mloop			; Jump to main loop
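; Worked example: command byte $53 followed by $02 gives type 5, n = 3,
; so count = 5 and offset = ($02 << 4) + 3 + 3 = 38; five bytes are copied
; from 38 bytes back in the output buffer (one unrolled move plus four
; trips through s_loop, since d4 = 5 - 2 = 3 feeds the dbf).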
no_spat	addq.b	#1,d4			; @ d4 now equals -1, 0, or 1. All
					; further checks can be made with
					; no more compares needed.
	bpl.s	no_srle			; @ fall through when d4 = -1 (srle)
;	addq.b	#2,d5			; @ Remove cnt += 3 (-1 for dbf loop)
	move.b	(a3)+,d7		; char to repeat (d5 + 3 times in all)
	move.b	d7,(a4)+		; @ Replaced the addq.b #2 with two
	move.b	d7,(a4)+		; unrolled moves. 2 bytes longer, but
sr_loop	move.b	d7,(a4)+		; faster, since we save 2 loop
	dbf	d5,sr_loop		; iterations
	bra.s	mloop			; Main loop
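; Count check: the two unrolled moves plus a dbf that executes d5 + 1 times
; give n + 3 repeats for low nibble n, which matches the cnt += 3 that was
; optimized away above.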
no_srle					; @ remove cmp #1
	bne.s	no_lrle
	moveq	#0,d7			; d4 = 0 for lrle
	move.b	(a3)+,d7		; if so, decode high/low count
	asl.w	#4,d7			; and add them, since a long RLE
	add.w	d7,d5			; is at least 19:
;	addi.w	#18,d5			; @ Remove cnt += 19 (-1 for dbf loop)
	move.b	(a3),d7			; @ Build d7 to longword size
	lsl.w	#8,d7			; @
	move.b	(a3)+,d7		; char to set
	move.w	d7,d4			; @
	swap	d7			; @
	move.w	d4,d7			; @ d7 has the char duplicated in all
					; 4 bytes
	move.w	a4,d4			; @ Get a4 to an even adr, so we can
	lsr.b	#1,d4			; @ use move.l's
	bcc.s	a4_even			; @
	move.b	d7,(a4)+		; @ a4 was odd, so move one odd byte
	subq.w	#1,d5			; @ ** Bug Alert **
					; If d5 was 0, it will be MI now,
					; and will have to be handled later.
a4_even	move.l	d7,(a4)+		; @ OK, OK, this looks kinda wasteful.
	move.l	d7,(a4)+		; @ Move 18 unrolled bytes to replace
	move.l	d7,(a4)+		; @ the addi.w #18,d5.
	move.l	d7,(a4)+		; @ I wanted this as fast as possible,
	move.w	d7,(a4)+		; @ and I hope no one misses 6 bytes
	move.w	d5,d4			; @
	bmi.s	done19			; @ Bug fix - d5 < 0 can only occur if
					; an RLE of length 19 occurred at an
					; odd adr. In that case, 19 bytes have
					; already been moved. Last bug, I hope.
	and.w	#3,d4			; @ d4 = d5 modulo 4
	lsr.w	#2,d5			; @ Adjust d5 for the move.l loop
	subq.w	#1,d5			; @
	bmi.s	lr_lp2			; @ If d5 < 4
lr_loop	move.l	d7,(a4)+		; @ (.l)
	dbf	d5,lr_loop
lr_lp2	move.b	d7,(a4)+		; @ Finish remainder of cnt
	dbf	d4,lr_lp2		; @
done19	moveq	#0,d7			; @ Clear d7 again
	bra.w	mloop			; @ (.w) Main loop
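; Count bookkeeping: d5 enters as (next << 4) + n, i.e. count - 19. The 18
; unrolled bytes (plus one extra when a4 started odd) cover the base 19,
; and the .l/.b loops then move the remaining d5 + 1 bytes.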
no_lrle					; only one case left: long pattern
	move.w	d5,d6
	addq.w	#3,d6			; offset is at least 3
	moveq	#0,d7
	move.b	(a3)+,d7		; Decode and add high and low
	asl.w	#4,d7			; offset
	add.w	d7,d6
	moveq	#0,d5
	move.b	(a3)+,d5		; Next, get the count,
	addi.w	#15,d5			; which is at least 16 (-1 for loop)
	move.l	a4,a6			; copy source = current dest
	sub.l	d6,a6			; minus the pattern offset
	lsr.b	#1,d6			; @ If the difference between a6 & a4
	bcs.s	rl_lp2			; @ is odd, they can't share alignment
	move.w	a6,d6			; @ Otherwise, we can opt with move.l
	lsr.b	#1,d6			; @
	bcc.s	a6_even			; @
	move.b	(a6)+,(a4)+		; @ Move odd byte so both adrs are even
	subq.w	#1,d5			; @ and adjust the count
a6_even	move.w	d5,d4			; @
	and.w	#3,d5			; @ d5 = d5 modulo 4
	lsr.w	#2,d4			; @ Adjust d4 for the move.l loop
	subq.w	#1,d4			; @
rl_loop	move.l	(a6)+,(a4)+		; @
	dbf	d4,rl_loop		; @
rl_lp2	move.b	(a6)+,(a4)+		; @
	dbf	d5,rl_lp2		; @
	bra.w	mloop
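; Note that the move.l loop is safe even though source and destination
; overlap: it is only used when the offset is even, and since the offset
; is at least 3 it is then at least 4, so every longword is read in full
; before any of its bytes could be overwritten.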
	END